In [ ]:
from __future__ import division

import datetime
from datetime import date

import geoplotlib as glp
import numpy as np
import pandas as pd
from dateutil.rrule import rrule, DAILY
from geoplotlib.utils import BoundingBox, DataAccessObject

pd.set_option('display.max_columns', None)
%matplotlib inline
In [ ]:
# Download NYC Central Park (KNYC) weather observations from Weather
# Underground as CSV for July 2012 - February 2016, then save one merged CSV.
start_date = date(2012, 7, 1)
end_date = date(2016, 2, 29)
# data = pd.DataFrame()
frames = []
url_template = 'https://www.wunderground.com/history/airport/KNYC/%s/%s/%s/DailyHistory.html?req_city=New+York&req_state=NY&req_statename=New+York&reqdb.zip=10001&reqdb.magic=4&reqdb.wmo=99999&format=1.csv'
month = ""
# Walk day by day but only issue one request per calendar month.
# NOTE(review): the URL is a *DailyHistory* endpoint; confirm the single
# request really returns the whole month, otherwise only the first day of
# each month is downloaded.
for dt in rrule(DAILY, dtstart=start_date, until=end_date):
    if (month != dt.strftime("%m")):
        month = dt.strftime("%m")
        print 'Downloading to memory: ' + dt.strftime("%Y-%m")
        frames.append(pd.read_csv(url_template % (dt.strftime("%Y"),dt.strftime("%m"), dt.strftime("%d"))))
print "Saving data to csv..."
data = pd.concat(frames)
data.to_csv('weather_data_nyc.csv', sep=',')
In [ ]:
from datetime import datetime
from dateutil import tz
# Cleaned hourly weather observations; each row has a DateUTC timestamp column
weather = pd.read_csv('datasets/weather_data_nyc_clean.csv')
def UTCtoActual(utcDate):
    """Convert a weather row's ``DateUTC`` string to local New York time.

    Parameters: utcDate - a row (or any object) with a ``DateUTC`` attribute
    formatted '%m/%d/%Y %H:%M:%S' in UTC.
    Returns: pd.Series with index ['Year', 'Month', 'Day', 'Hour'] holding the
    America/New_York local components (DST-aware via dateutil's tz database).
    """
    from datetime import datetime
    from dateutil import tz
    from_zone = tz.gettz('UTC')
    to_zone = tz.gettz('America/New_York')
    local = datetime.strptime(utcDate.DateUTC, '%m/%d/%Y %H:%M:%S')\
        .replace(tzinfo=from_zone)\
        .astimezone(to_zone)
    # BUG FIX: a Series has no ``columns``; the old ``s.columns = [...]`` was a
    # silent no-op, leaving the Series with a 0..3 index so the later
    # ``weather[['Year','Month','Day','Hour']] = weather.apply(...)`` assignment
    # could not align by label. Name the index at construction instead.
    return pd.Series([local.year, local.month, local.day, local.hour],
                     index=['Year', 'Month', 'Day', 'Hour'])
#weather['DateActual'] = weather.DateUTC.map()
In [ ]:
# Add local-time Year/Month/Day/Hour columns to every weather row and persist.
# NOTE(review): row-wise apply over the whole frame is slow, but runs once.
weather[['Year', 'Month', 'Day', 'Hour']] = weather.apply(UTCtoActual, axis=1)
weather.to_csv('datasets/weather_data_nyc_clean2.csv')
In [ ]:
# Load the NYPD collision records and the weather data with local-time columns
incidents = pd.read_csv('datasets/NYPD_Motor_Vehicle_Collisions.csv')
weather = pd.read_csv('datasets/weather_data_nyc_clean2.csv')
weather.head(1)
In [ ]:
weather[(weather.Year == 2015) & (weather.Month == 11) & (weather.Day == 27)]
In [ ]:
# Weather columns merged onto incidents: features0 is a reduced set,
# features is the full set used by merge_weather below.
features0 = ['Conditions', 'TemperatureC']
features = ['Conditions', 'Precipitationmm',\
            'TemperatureC', 'VisibilityKm']
def lookup_weather2(year, month, day, hour):
    """Return every weather row recorded for the given local date and hour."""
    match = ((weather.Year == year) & (weather.Month == month)
             & (weather.Day == day) & (weather.Hour == hour))
    return weather[match]
def lookup_weather(date, time):
    """Find the weather observation for an incident's date and time.

    Parameters: date - 'MM/DD/YYYY' string; time - 'H:MM' string.
    Returns: a one-row DataFrame for the exact hour, or all rows for the
    previous/next hour when the exact hour has no observation; an empty
    frame when none of the three hours match.
    """
    # BUG FIX: cell L28 does `from datetime import datetime`, rebinding the
    # module name to the class, so the original `datetime.datetime(...)` and
    # `datetime.timedelta(...)` raised AttributeError whenever the exact hour
    # was missing. Import the needed names locally instead.
    from datetime import datetime as _datetime, timedelta
    month = int(date.split('/')[0])
    day = int(date.split('/')[1])
    year = int(date.split('/')[2])
    hour = int(time.split(':')[0])
    d = lookup_weather2(year, month, day, hour).head(1)
    if (d.empty):
        center = _datetime(year, month, day, hour)
        # Fall back one hour earlier first, then one hour later.
        for delta in (timedelta(hours=-1), timedelta(hours=1)):
            near = center + delta
            candidate = lookup_weather2(near.year, near.month, near.day, near.hour)
            if (not candidate.empty):
                return candidate
    return d
def merge_weather(incident):
date = incident.DATE
time = incident.TIME
#print "0"
w = lookup_weather(date, time)
#[unnamed, condition, dateUTC, Dew, Events, Gust, Humidity,Precipitationmm,Sea_Level_PressurehPa, TemperatureC] = w.values[0]
#print "1"
try:
#print "2"
#print w
con = "-"
temp = "-"
rainmm = "-"
viskm = "-"
#print "2.5"
if (not pd.isnull(w['Conditions'].iloc[0])):
con = w['Conditions'].iloc[0]
if (not pd.isnull(w['TemperatureC'].iloc[0])):
temp = w['TemperatureC'].iloc[0]
if (not pd.isnull(w['Precipitationmm'].iloc[0])):
rainmm = w['Precipitationmm'].iloc[0]
if (not pd.isnull(w['VisibilityKm'].iloc[0])):
viskm = w['VisibilityKm'].iloc[0]
#print 'con %s, temp %s, rainmm %s, viskm %s' % (con, temp, rainmm, viskm)
#print "2.75"
s = pd.Series([con, rainmm, temp, viskm])
#print "3"
#print str(len(w.values[0]))
#s = pd.Series(w.values[0])
#s = pd.Series([w['Conditions'].iloc[0], w['Dew PointC'].iloc[0], w['Gust SpeedKm/h'].iloc[0]])
#s.columns = features
return s
except:
#print "4"
print date + "x" + time
s = pd.Series([None,None,None,None])
#s = pd.Series(["1","2","3","4","5","6","7","8","9"])
#s = pd.Series([])
#s.columns = features
return s
#lookup_weather2(2016, 2, 14, 7)
#lookup_weather('03/14/2016', '3:27').values[0]
#[unnamed, condition, dateUTC, Dew, Events, Gust, Humidity,Precipitationmm,Sea_Level_PressurehPa, TemperatureC] = lookup_weather('01/27/2016', '3:27').values[0]
In [ ]:
# Merge weather features onto every pre-2016 incident (row-wise apply; slow).
print "Applying weather data to incidents..."
incidents[features] = incidents[incidents.DATE.str.split('/').str.get(2) != '2016'].apply(merge_weather, axis=1)
print "Saving weather in-riched incident data..."
incidents.to_csv('datasets/NYPD_Motor_Vehicle_Collisions_weather3.csv', sep=',')
In [ ]:
incidents[incidents.DATE.str.split('/').str.get(2) == '2016']
In [ ]:
# Read dataset
incidents = pd.read_csv('datasets/NYPD_Motor_Vehicle_Collisions_weather3.csv')
# Filter 2016 incidents
# Keep only pre-2016 rows that actually received merged weather data.
incidents = incidents[(incidents.DATE.str.split('/').str.get(2) != '2016')
                      & (pd.notnull(incidents.Conditions))]
In [ ]:
# Distribution of incidents by weather conditions
# Distribution of incidents by weather conditions.
# IDIOM: value_counts() does the manual unique()/mask counting loop in a
# single pass with identical counts.
condition_counts = incidents.Conditions.value_counts().sort_values()
df = pd.DataFrame(condition_counts.rename("Incidents by weather conditions"))
df.plot(kind='barh', figsize=(8,8))
In [ ]:
df
Now let's try to find out whether there are any conditions that cause more incidents than others. We do this by plotting heatmaps to get an idea of the distributions in the NYC area.
In [ ]:
def plot_zip_weather(condition, data):
    """Horizontal bar chart of incident counts per ZIP code for `condition`.

    BUG FIX: the original unique()/== loop always counted NaN ZIP codes as 0,
    because `NaN == NaN` is False; value_counts() counts the real values and
    drops the meaningless NaN bucket entirely.
    """
    zip_counts = data['ZIP CODE'].value_counts().sort_values()
    df = pd.DataFrame(zip_counts.rename("%s incidents by zip code" % condition))
    df.plot(kind='barh', figsize=(8,32))
def draw_kde(data):
    """Render a geoplotlib kernel-density heatmap of the given incidents.

    NOTE(review): the 0.055-degree offsets shrink the box on the north/west
    edges but expand it on the south/east edges - confirm this asymmetric
    crop is intended.
    """
    bbox = BoundingBox(north=data.LATITUDE.max()-0.055,\
                       west=data.LONGITUDE.min()+0.055,\
                       south=data.LATITUDE.min()-0.055,\
                       east=data.LONGITUDE.max()+0.055)
    coords = {'lat': data.LATITUDE.values.tolist(), 'lon': data.LONGITUDE.values.tolist()}
    glp.kde(coords, bw=5, cut_below=1e-4)
    glp.set_bbox(bbox)
    glp.inline()
def plot_stuff(conditions, data):
    # Label, per-zip bar chart, then the KDE heatmap for one weather category
    print "%s conditions" % conditions
    plot_zip_weather(conditions, data)
    draw_kde(data)

# Incident subsets by broad weather category (substring match on Conditions)
snowy = incidents[incidents['Conditions'].str.contains('Snow')]
rainy = incidents[incidents['Conditions'].str.contains('Rain')]
clear = incidents[incidents['Conditions'].str.contains('Clear')]
cloudy = incidents[(incidents['Conditions'].str.contains('Cloud')) | (incidents['Conditions'].str.contains('Overcast'))]
haze = incidents[incidents['Conditions'].str.contains('Haze')]
plot_stuff("Snowy", snowy)
plot_stuff("Rainy", rainy)
plot_stuff("Clear", clear)
plot_stuff("Cloudy", cloudy)
plot_stuff("Hazy", haze)
Finding the ratio of incidents across weather conditions, at the borough level.
In [ ]:
# What is the probability of an incident based on the weather condition?
# Normalize incidents based on the conditions.
from collections import Counter
ConditionIncidentCounter = Counter(incidents.Conditions.values)
p_incident = {}
# City-wide share of incidents per condition (true division via __future__)
for k,v in ConditionIncidentCounter.most_common():
    p_incident[k] = v/len(incidents)
p_incident
# Do the same again but for individual areas of NYC
p_incident_district = {}
l = len(incidents)
for district in incidents[pd.notnull(incidents.BOROUGH)].BOROUGH.unique():
    filtered = incidents[incidents.BOROUGH == district]
    counter = Counter(filtered.Conditions.values)
    p_incident_district[district] = {}
    # NOTE(review): len(list(counter.elements())) equals len(filtered) - the
    # elements() round-trip is an expensive way to get that count.
    for k,v in counter.most_common():
        p_incident_district[district][k] = v / len(list(counter.elements()));
p_incident_district
# Are there any areas in NYC that experience incidents based
# on a condition unusually higher or lower compared to other areas?
# Calculate the ratio of incidents based on the condition.
def calcRatioForDistrict(districtCounter, overAllCounter, district):
    """Ratio of a district's per-condition share to the city-wide share.

    Parameters: districtCounter/overAllCounter - dicts mapping condition name
    to probability; district - unused label kept for interface compatibility.
    Returns: pd.Series indexed by condition name.

    BUG FIX / consistency: conditions never observed in the district raised
    KeyError on `districtCounter[con]`; guard them to 0 exactly like the
    zip-code variant of this function further down the notebook.
    """
    ys = []
    xs = []
    for con in incidents.Conditions.unique():
        if (con in districtCounter):
            ys.append(districtCounter[con] / overAllCounter[con])
        else:
            ys.append(0)
        xs.append(con)
    return pd.Series(ys, index=xs)
# One ratio Series per borough, plotted as a grid of sub-plots
series = {}
for b in incidents[pd.notnull(incidents.BOROUGH)].BOROUGH.unique():
    series[b] = calcRatioForDistrict(p_incident_district[b], p_incident, b)
df = pd.DataFrame(series)
df.plot(kind="bar", subplots=True, figsize=(14,14),layout=(7,2), legend=False,sharey=True)
Let's try to look at zip codes in Manhattan only.
In [ ]:
# What is the probability of an incident based on the weather condition?
# Normalize incidents based on the conditions (Manhattan only).
from collections import Counter
borough = incidents[incidents.BOROUGH == 'MANHATTAN']
ConditionIncidentCounter = Counter(borough.Conditions.values)
p_incident = {}
for k,v in ConditionIncidentCounter.most_common():
    p_incident[k] = v/len(borough)
p_incident
# Do the same again but for individual zip codes of the borough
p_incident_borough_zip = {}
l = len(borough)
# BUG FIX: the boolean masks were built from the full `incidents` frame but
# applied to the `borough` subset, which misaligns (and in recent pandas
# fails) boolean indexing; build every mask from `borough` itself.
for z in borough[pd.notnull(borough['ZIP CODE'])]['ZIP CODE'].unique():
    filtered = borough[borough['ZIP CODE'] == z]
    counter = Counter(filtered.Conditions.values)
    # z = str(z).split(".")[0]
    p_incident_borough_zip[z] = {}
    for k,v in counter.most_common():
        p_incident_borough_zip[z][k] = v / len(list(counter.elements()));
p_incident_borough_zip
# Are there any areas in NYC that experience incidents based
# on a condition unusually higher or lower compared to other areas?
# Calculate the ratio of incidents based on the condition.
def calcRatioForDistrict(districtCounter, overAllCounter, district):
    """Ratio of an area's per-condition share to the reference share.

    Conditions absent from the area's counter get a ratio of 0.
    Returns a pd.Series indexed by condition name.
    """
    conditions = incidents.Conditions.unique()
    ratios = [districtCounter[c] / overAllCounter[c] if c in districtCounter else 0
              for c in conditions]
    return pd.Series(ratios, index=list(conditions))
# One ratio Series per Manhattan zip code.
series = {}
# BUG FIX: build the ZIP mask from `borough` (not the full `incidents` frame,
# whose index does not align with the subset) and pass the zip code `z` -
# not `b`, a stale loop variable left over from the borough-level cell -
# as the district argument.
for z in borough[pd.notnull(borough['ZIP CODE'])]['ZIP CODE'].unique():
    series[z] = calcRatioForDistrict(p_incident_borough_zip[z], p_incident, z)
df = pd.DataFrame(series)
In [ ]:
df.plot(kind="bar", subplots=True, figsize=(14,100), layout=(50,2), legend=False, sharey=False)
In [ ]:
# Date with the most incidents, and all incidents from that date
worst_day = incidents.DATE.value_counts().index[0]
# NOTE(review): positional [0] on value_counts() relies on the deprecated
# label-vs-position fallback; .iloc[0] would be explicit.
worst_day_count = incidents.DATE.value_counts()[0]
incidents[incidents.DATE == worst_day]
In [ ]:
# Incident counts per date, descending
incidents.DATE.value_counts()
In [ ]:
# All distinct primary contributing factors
incidents['CONTRIBUTING FACTOR VEHICLE 1'].unique()
In [ ]:
# Read dataset
# NOTE(review): reads ..._weather4.csv although the merge earlier in this
# notebook saved ..._weather3.csv - confirm where the weather4 file is made.
incidents = pd.read_csv('datasets/NYPD_Motor_Vehicle_Collisions_weather4.csv')
# Filter 2016 incidents
incidents = incidents[(incidents.DATE.str.split('/').str.get(2) != '2016')
                      & (pd.notnull(incidents.Conditions))]
In [ ]:
def count_contributing(cont):
    """Count incidents where any of the five vehicles lists `cont` as a factor."""
    factor_columns = ['CONTRIBUTING FACTOR VEHICLE %d' % i for i in range(1, 6)]
    # OR together the per-column equality masks instead of one giant expression
    mask = incidents[factor_columns[0]] == cont
    for column in factor_columns[1:]:
        mask = mask | (incidents[column] == cont)
    return incidents[mask].shape[0]
# Counts for the three weather/road-surface related contributing factors
print "Accidents caused by Pavement Slippery: %s" % count_contributing('Pavement Slippery')
print "Accidents caused by Glare: %s " % count_contributing('Glare')
print "Accidents caused by Pavement Defective: %s " % count_contributing('Pavement Defective')
There seem to be a lot of incidents caused by slippery pavement. Let's look at the weather conditions for those incidents.
In [ ]:
# Incidents where any of the five vehicles has 'Pavement Slippery'
# recorded as a contributing factor
weather_incidents = incidents[(incidents['CONTRIBUTING FACTOR VEHICLE 1'] == 'Pavement Slippery') | \
                              (incidents['CONTRIBUTING FACTOR VEHICLE 2'] == 'Pavement Slippery') | \
                              (incidents['CONTRIBUTING FACTOR VEHICLE 3'] == 'Pavement Slippery') | \
                              (incidents['CONTRIBUTING FACTOR VEHICLE 4'] == 'Pavement Slippery') | \
                              (incidents['CONTRIBUTING FACTOR VEHICLE 5'] == 'Pavement Slippery') ]
In [ ]:
# Distribution of weather conditions during 'slippery pavement' incidents.
# IDIOM: value_counts() replaces the manual unique()/mask counting loop.
counts = weather_incidents.Conditions.value_counts()
xs = list(counts.index)
# Keep plain Python ints (numpy ints are not JSON-serializable below)
ys = [int(v) for v in counts.values]
df = pd.DataFrame(counts.sort_values()
                  .rename("Weather conditions during 'slippery pavement' based incidents"))
df.plot(kind='barh', figsize=(8,8))
# Export to json for d3 viz
from collections import OrderedDict
import json
with open('datasets/slippery_pavement.json', 'w') as fp:
    json.dump(OrderedDict(sorted(dict(zip(xs, ys)).items(), key=lambda x: x[1], reverse=True)), fp)
Okay, overcast and clear weather are still the top 2. The assumption that these incidents are caused by weather conditions might still hold true. It could be that the top 2 are caused by pavement conditions independent of the weather, such as water or oil on the roads. In any case, let's try to plot where these incidents occur.
In [ ]:
def draw_dot(data, type_color):
    """Scatter-plot the incidents' coordinates with geoplotlib.

    The bounding box is computed from the full `incidents` frame so every
    call shares the same extent. NOTE(review): the 0.055-degree offsets
    shrink north/west and expand south/east - confirm intent.
    """
    bbox = BoundingBox(north=incidents.LATITUDE.max()-0.055,\
                       west=incidents.LONGITUDE.min()+0.055,\
                       south=incidents.LATITUDE.min()-0.055,\
                       east=incidents.LONGITUDE.max()+0.055)
    gridDots = {'lat': data.LATITUDE.values.tolist(), 'lon': data.LONGITUDE.values.tolist()}
    glp.set_bbox(bbox)
    glp.dot(gridDots, color=type_color)
def get_spaced_colors(n):
    """Return at least n evenly spaced, fully opaque RGBA colors.

    The 24-bit RGB space is sampled at a fixed stride; each sample becomes a
    [r, g, b, 255] list of ints in 0-255.
    """
    space_size = 255 ** 3  # 16581375, the size of the sampled RGB space
    stride = int(space_size / n)
    palette = []
    for value in range(0, space_size, stride):
        hexstr = hex(value)[2:].zfill(6)
        palette.append([int(hexstr[0:2], 16), int(hexstr[2:4], 16), int(hexstr[4:6], 16), 255])
    return palette
# One dot layer per weather condition, each in its own color
colormap = get_spaced_colors(weather_incidents['Conditions'].unique().size)
for idx, wi in enumerate(weather_incidents['Conditions'].unique().tolist()):
    filtered = weather_incidents[weather_incidents['Conditions'] == wi]
    print "%s %s" % (wi, str(len(filtered.index)))
    draw_dot(filtered, colormap[idx])
    # NOTE(review): this second call immediately re-draws the same points in
    # red ('r'), overriding the colormap layer - probably leftover debugging.
    draw_dot(filtered, 'r')
glp.inline()
#glp.inline()
In [ ]:
# KDE heatmap of slippery-pavement incidents over the full city extent
bbox = BoundingBox(north=incidents.LATITUDE.max()-0.055,\
                   west=incidents.LONGITUDE.min()+0.055,\
                   south=incidents.LATITUDE.min()-0.055,\
                   east=incidents.LONGITUDE.max()+0.055)
glp.set_bbox(bbox)
glp.kde({'lat': weather_incidents.LATITUDE.values.astype('float'), 'lon': weather_incidents.LONGITUDE.values.astype('float')},bw=5, cut_below=1e-4)
glp.inline()
Looking at the intersections, we can identify the most dangerous ones based on the number of incidents occurring there that were in some way caused by slippery pavement.
In [ ]:
# Top intersections (LOCATION) for slippery-pavement incidents.
# NOTE(review): the variable is named top10 but it holds the top 20.
top10 = weather_incidents.LOCATION.value_counts()[:20]
In [ ]:
top10.to_csv('datasets/top20slippery')
Ignoring incidents happening outside intersections, the top 3 look like this:
What we find is that — at least for the top 3 — incidents occurring because of slippery pavement happen because of steeply angled roads leading into an intersection, or very bad road pavement conditions.
In [ ]:
# One representative row per top intersection, keeping location columns only
locations = weather_incidents[weather_incidents.LOCATION.isin(top10.index)].drop_duplicates('LOCATION','first')\
    [['TIME','BOROUGH','ZIP CODE','LATITUDE','LONGITUDE','LOCATION','ON STREET NAME','CROSS STREET NAME']]
In [ ]:
loca = locations.copy()
In [ ]:
def m(r):
    """Return the slippery-pavement incident count recorded in top10 for this row's LOCATION."""
    matches = top10[top10.index == r.LOCATION]
    return matches.iloc[0]

# Attach each top intersection's incident count as a COUNT column
loca['COUNT'] = loca.apply(m, axis=1)
In [ ]:
loca.sort_values(by='COUNT', ascending=False).to_csv('../app/datasets/slippery.csv', sep=',')
In [ ]:
lightsnow = incidents[incidents['Conditions'] == 'Light Snow']
print "Accidents happening because of light snow: %s" % str(lightsnow.size)
print "Injuries: %s" % lightsnow['NUMBER OF PERSONS INJURED'].sum()
print "Killed: %s" % lightsnow['NUMBER OF PERSONS KILLED'].sum()
print "Top intersections:"
lightsnow.LOCATION.value_counts()[:3]
In [ ]:
lightrain = incidents[incidents['Conditions'] == 'Light Rain']
print "Accidents happening because of light rain: %s" % str(lightrain.size)
print "Injuries: %s" % lightrain['NUMBER OF PERSONS INJURED'].sum()
print "Killed: %s" % lightrain['NUMBER OF PERSONS KILLED'].sum()
print "Top intersections:"
lightrain.LOCATION.value_counts()[:3]
In [ ]: